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Detailed protocol 


#### Single-cell bioinformatics analysis for thyroid cancer study 


a = 
### Integration of scRNAs-seq data 


# dependency package install 


devtools::install_github(‘satijalab/seurat-wrappers' force = T) 
need_pkgs <- c('Seurat','cowplot’, 'slingshot', 
‘data.table’, 'reticulate’,'dplyr’, 
'RColorBrewer','SingleCellExperiment’) 


for(i in need_pkgs){ 
require(i, quietly = T, character.only = T) 


} 


# setting the file pathway 


setwd('/home/gyeongdaekim/) 
file_list <- list-files() 
print(file_list) 


sampleNames = mapply(function(x) x[length(x)], strsplit(file_list, split = '_')) 
sampleNames <- gsub('-','_';ssampleNames) 

meta.info = data.table(file_Path = file_list, sampleID = sampleNames) 
meta.info[, SubType:=mapply(function(x) x{1], strsplit(samplelD, '_'))] 
DT::datatable(meta.info) 


# setting the integration and QC option 


norm_method = 'standard' 
treat = FALSE 

show = FALSE 
MT_cut=15 

batch_list = list() 


for(i in 1:nrow(meta.info)){ 
dir_of_10X = meta.info$file_Path[i] 
samplelD = meta.info$samplelD[i] 
subtype = meta.info$SubTypef[i] 
seq_batch = meta.info$Batch[i] 
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sc_mat = Read10X(dir_of_10X) 


sc_tumor <- CreateSeuratObject(counts = sc_mat, project = samplelD, 
min.cells = 3, min.features = 200) 
sc_tumor[["percent.mt"]] <- PercentageFeatureSet(sc_tumor, pattern = "“MT-") 


qc = VinPlot(sc_tumor, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) 


plot1 <- FeatureScatter(sc_tumor, feature1 = "nCount_RNA", feature2 = "percent.mt") 
plot2 <- FeatureScatter(sc_tumor, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") 


sc_tumor <- subset(sc_tumor, subset = nFeature_RNA > 200 & percent.mt <= MT_cut) 


if(norm_method == 'SCT'){ 
sc_tumor <- SCTransform(sc_tumor, vars.to.regress = 'percent.mt') 
}else{ 
sc_tumor <- NormalizeData(sc_tumor, verbose = F) 
sc_tumor <- FindVariableFeatures(sc_tumor, selection.method = "vst", 
nfeatures = 2000, verbose = FALSE) 


sc_tumor <- ScaleData(sc_tumoer, vrbose = F) 


} 


sc_tumor <- RenameCells(object = sc_tumor, add.cell.id = samplelD) # add sample name as prefix 
if(treat == TRUE) 

sc_tumor <- RunPCA(sc_tumor, verbose = T) 

sc_tumor <- RunUMAP(sc_tumor, dims = 1:30, verbose = FALSE) 

sc_tumor <- RunTSNE(sc_tumor, dims = 1:30, verbose = FALSE) 


sc_tumor <- FindNeighbors(sc_tumor, dims = 1:30, verbose = FALSE) 
sc_tumor <- FindClusters(sc_tumor, verbose = FALSE) 


umap = DimPlot(sc_tumor, label = TRUE) + NoLegend() 
tsne = TSNEPlot(sc_tumor, label = TRUE) + NoLegend() 


if(show == TRUE) 


print(qc) 
plot_grid(plot1 ,plot2) 
#dev.off() 


print(umap) 
print(tsne) 
} 
} 


# add information 

sc_tumor@meta.data[,'SubType'] = subtype 
sc_tumor@meta.data[,,Sample|D'] = sampleID 
sc_tumor@meta.data[,'Batch'] = seq_batch 

### add batch data into a list to merge 
#assign(pasteO(samplelD,'.Seurat'),sc_tumor) 

batch_list <- append(batch_list, sc_tumor) 
print(paste("Finishing processing", samplelD, ‘at’, Sys.time())) 


data.combined <- Reduce(function(x,y){merge(x,y,merge.data = TRUE)},batch_list) 


# QC check 
qc = VInPlot(data.combined, group.by = 'SamplelD', features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,pt.size = 0) 
qc 
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data.combined <- FindVariableFeatures(data.combined, selection.method = "vst", nfeatures = 2000, verbose = FALSE) 
data.combined <- ScaleData(data.combined, vars.to.regress = c(‘nCount_RNA', 'percent.mt’),verbose = F) 


data.combined <- RunPCA(data.combined, npcs=30, verbose=F) 


# selection of the principle component number 
data.combined <- JackStraw(data.combined, num.replicate = 100) 
data.combined <- ScoreJackStraw(data.combined, dims = 1:20) 
JackStrawPlot(data.combined, dims = 1:20) 


ElbowPlot(object = data.combined) 


num_PC = 30 


data.combined <- RunUMAP(data.combined, reduction = 'pca’, dims=1:num_PC) 


data.combined <- FindNeighbors(data.combined, reduction = 'pca’, dims=1:num_PC) 


data.combined <- FindClusters(data.combined, resolution = 1) 


DimPlot(data.combined, label = TRUE) + NoLegend() 
DimPlot(data.combined, label = TRUE, split.by = "orig.ident",ncol = 4) + NoLegend() 


library(SeuratWrappers) 
library(SeuratData) 


data.combined5 <- RunFastMNN(object.list = SplitObject(data.combined, split.by = "SamplelD")) 
data.combined5 <- RunUMAP (data.combined5, reduction = "mnn", dims = 1:30) 
data.combined5 <- FindNeighbors(data.combined5, reduction = "mnn", dims = 1:30) 


data.combined5 <- FindClusters(data.combined5, resolution = 0.7) 


DimPlot(data.combined5, label = TRUE) + NoLegend() 


for(j in O:length(names(data.combined5@active.ident))){ 
n=table(data.combined5@active.ident) 
ident=j 
n[names(n) %in% c(O:ident)|=0 


n=n[-1] 
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U-n[-U] 
ident2=names(o[1]) 

bim <- FindMarkers(data.combined5, ident.1 = ident, ident.2 =ident2, only.pos = F, test.use = "bimod") 
deg=bim[bim$p_val_adj < 0.01,] 

deg1=deg[abs(deg$avg_logFC) >= 1,] 

degs=dim(deg) 


degs1=dim(deg1) 


for(i in c(names(o[-1])) 
bim <- FindMarkers(data.combined§, ident.1 = ident, ident.2 =i, only.pos = F, test.use = "bimod") 
deg=bim[bim$p_val_adj < 0.01,] 
deg1=deg[abs(deg$avg_logFC) >= 1,] 
degs=cbind(degs,dim(deg)) 


degs1=cbind(degs1 ,dim(deg1)) 


print(degs1) 


# merge the similar clusters having the difference of DEGs less than 10 


data.combined5 <- Renameldents(object = data.combined5,"0"="0",  "1"="0", "2"="2", "Z"="0", "4"=ar4n, "5"=s"5", 
"9"="9","40"="10", m4q4qneny 4d", m4o"="4o" "43"="13", "44ns"44", "45"="15", 


# again, calculation of the number of DEGs between clusters 


data.combined5 <- Renameldents(object = data.combined5,"0"="0",  "2"="1", "4™="2", "5"="3" "7"=S"4" NEMsN5", 
"40"="7","141"S"8", "42"="9" "413"="40", "44"="4 4", "45"="12", "46"="13", "47"="44", 


# marker of each cluster 


"46"="16", 


data.combined5.marker <- FindAllMarkers(data.combined5, max.cells.per.ident = 100, min.diff.pct = 0.3, only.pos = TRUE) 


write.csv(data.combined5.marker, file = "marker.csv') 


FeaturePlot(data.combined5, features = c("SLC34A2"), cols = c("yellow2","DARKGREEN","BLUE4")) 


# annotation each cluster 


new.cluster.ids <- c("PTC", "NK cell", "Macrophage", "Follicular", "T cell", "Endothelium", 
"Fibroblast2", "B cell", "Mast", "ATC", "Activated T cell", "TAMC", "Parafollicular") 


names(new.cluster.ids) <- levels(data.combined5) 
data.combined5 <- Renameldents(data.combined5, new.cluster.ids) 
# sort cell types 


ident <- c("Follicular", "PTC", "PTC2", "ATC", "Parafollicular", "Fibroblast1", "Fibroblast2", 
cell", "Mast", "Activated T cell", "Macrophage", "Macrophage2", "TAMC") 


data.combined5@active.ident<- factor(x = data.combined5@active.ident, levels = ident) 
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DimPlot(data.combined5, label = TRUE, split.by = "SubType" , ncol = 4, label.size = 0) + NoLegend() 


modify_vinplot<- function(obj, 
feature, 
pt.size = 0, 
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"), 
){ 
p<- VinPlot(obj, features = feature, pt.size = pt.size, ....) + 
xlab("") + ylab(feature) + ggtitle("") + 
theme(legend.position = "none", 
axis.text.x = element_blank(), 
axis.ticks.x = element_blank(), 
axis.title.y = element_text(size = rel(1), angle = 0), 
axis.text.y = element_text(size = rel(1)), 
plot.margin = plot.margin ) 
return(p) 


} 


## extract the max value of the y axis 

extract_max<- function(p){ 
ymax<- max(ggplot_build(p)$layout$panel_scales_y/[1]]$range$range) 
return(ceiling(ymax)) 


} 


## main function 
StackedVInPlot<- function(obj, features, 


pt.size = 0, 
plot.margin = unit(c(-0.75, 0, -0.75, 0), "cm"), 
“4 


plot_list<- purrr::map(features, function(x) modify_vinplot(obj = obj,feature = x, ...)) 


# Add back x-axis title to bottom plot. patchwork is going to support this? 
plot_list[[length(plot_list)]]<- plot_list{[length(plot_list)]] + 
theme(axis.text.x=element_text(), axis.ticks.x = element_line()) 


# change the y-axis tick to only max value 

ymaxs<- purrr::map_dbl(plot_list, extract_max) 

plot_list<- purrr::map2(plot_list, ymaxs, function(x,y) x + 
scale_y_continuous(breaks = c(y)) + 
expand_limits(y = y)) 


p<- patchwork::wrap_plots(plotlist = plot_list, ncol = 1) 


return(p) 


} 


features <- cluster marker 
StackedVInPlot(obj = data.combined5, features = features) 


save(data.combined5, file="/home/gyeongdaekim/thyroid/object/data.combined.RData") 


#### cancer progression trajectory from Seurat object 
[ee = 


library(monocle) 


# isolation of specific cell types 


Fol<-subset(data.combined5, cells=names(data.combined5 @active.ident[data.combined5@active.ident %in% c("Follicular")])) 
Fol<-subset(Fol, cells=names(Fol@active.ident[Fol$SubType%in% c("NOM","PTC","ATC")})) 


ATC<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5@active.ident %in% c("ATC")])) 
ATC<-subset(ATC, cells=names(ATC @active.ident[ATC$SubType %in% c("ATC","PTC")])) 


PTC<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5$orig.ident %in% c("PTC_XHY1_190702", 
"PTC_XHY2_190702")])) 
PTC<-subset(PTC, cells=names(PTC @active.ident[PTC @active.ident %in% c("PTC")])) 


aPTC<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5@active.ident %in% c("PTC")])) 
aPTC<-subset(aPTC, cells=names(aPTC @active.ident[aPTC$SubType %in% c("ATC")})) 


PTC2<-subset(data.combined5, cells=names(data.combined5@active.ident[data.combined5@active.ident %in% c("PTC2")])) 
PTC2<-subset(PTC2, cells=names(PTC2@active.ident[PTC2$SubType %in% c("PTC")])) 


a<-subset(Fol, cells=names(Fol@active.ident[Fol$SubType %in% c("NOM")})) 
b<-subset(Fol, cells=names(Fol@active.ident[Fol}SubType %in% c("PTC")])) 
c<-subset(Fol, cells=names(Fol@active.ident[FolfSubType %in% c("ATC")])) 


e<-subset(ATC, cells=names(ATC @active.ident[ATC$SubType %in% c("PTC")})) 
f<-subset(ATC, cells=names(ATC @active.ident[ATC$SubType %in% c("ATC")])) 


Idents(object =Fol, cells =colnames(a))="NOM_follicular" 
Idents(object =Fol, cells =colnames(b))="PTC_follicular" 


Idents(object =Fol, cells =colnames(c))="ATC_follicular" 


Idents(object =ATC, cells =colnames(e))="pATC" 
Idents(object =ATC, cells =colnames(f))="aATC" 


Idents(object =PTC, cells =colnames(PTC))="pPTC" 
Idents(object =aPTC, cells =colnames(aPTC))="aPTC" 


table(Fol@active.ident) 
table(ATC @active.ident) 


thyroid <- merge(Fol, y= c(ATC,PTC,PTC2,aPTC), project = "thyroid") 


# monocle2 workflow 
table(thyroid@active.ident) 
thyroid$nCount_RNA=NULL 
thyroid$nFeature_RNA=NULL 
thyroid$percent.mt=NULL 
thyroid$orig.ident<-thyroid@active.ident 
thyroid@meta.data=thyroid@meta.datal, 1:3] 


x=GetAssayData(object = thyroid, slot = "counts")[rownames(GetAssayData(object = thyroid, slot = "counts")) %in% rownames(GetAssayData(object = 
thyroid)), colnames(GetAssayData(object = thyroid, slot = "counts")) %in% colnames(GetAssayData (object = thyroid))] 


target=x[,colnames(x) %in% rownames(thyroid@meta.data)] 
target<-as.matrix(target) 

target=as.matrix(t(target)) 

name=merge(thyroid@meta.data, target, by="row.names") 
rownames(name)=name$Row.names 

name=name[,-1] 
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thyroid@meta.data=name[,1:4] 
name=name[,-1:-3] 
target=t(name) 

target[1:5,1:4] 
g=as.data.frame(rownames(x)) 
rownames(g)=rownames(x) 


colnames(g)=c("gene_short_name") 


pd <- new("AnnotatedDataFrame", data = thyroid@meta.data) 

fd <- new("AnnotatedDataFrame", data = g) 

HSMM <- newCellDataSet(as.matrix(target), phenoData = pd, featureData = fd, expressionFamily=negbinomial.size()) 
HSMM <- estimateSizeFactors(HSMM) 

HSMM <- estimateDispersions(HSMM) 

HSMM <- detectGenes(HSMM, min_expr = 0.1) 

print(head(fData(HSMM))) 

expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 10)) 

disp_table <- dispersionTable(HSMM) 

ordering_genes <- subset(disp_table, mean_expression >= 0.05 & dispersion_empirical >= 2 * dispersion_fit)$gene_id 
HSMM <- setOrderingFilter(HSMM, ordering_genes) 

plot_ordering_genes(HSMM) 

HSMM <- reduceDimension(HSMM, max_components=2) 


HSMM <- orderCells(HSMM, reverse=FALSE) 


coordinate<-t(HSMM@reducedDimS) 
head(coordinate) 

colnames(coordinate )<-c("x","y") 
coordinate<-as.data.frame(coordinate) 
coordinate<-cbind(-coordinate$y,coordinate$x) 
coordinate<-t(coordinate) 
HSMM@reducedDimS <-coordinate 


colnames(HSMM@reducedDimS)<-colnames(HSMM) 


plot_cell_trajectory(HSMM, show_tree = FALSE ,show_backbone = FALSE, color_by="orig.ident", show_branch_points = FALSE, cell_size=1) 


plot_cell_trajectory(HSMM, show_tree = FALSE ,show_backbone = FALSE, color_by="orig.ident", show_branch_points = FALSE, cell_size=1) + 
scale color manual(breaks = c("aPTC", "pATC","ATC" "PTC", "PTC2", "NOM follicular","PTC follicular", "ATC follicular"), values=c("black", 


"darkorchid","darkseagreen2", "dodgerblue3", "indianred3","yellow2","turquoise3","RED1")) + theme(legend.position = "right") 


pseudo=t(HSMM@reducedDimS) 
colnames(pseudo)=c("“component1","component2") 
expression=target["COL1A1",] 
expression[expression>5]=5 


ggplot() + geom_point(data=as.data.frame(pseudo), aes(component1, component2, color=expression), size=1.5) + scale_colour_gradient(low = "grey" high = 


HSMM_subset <- HSMM2[,colnames(HSMM2)%in% rownames(subset(pData(HSMM2), State %in% c("1","2")))] 
HSMM_subset@reducedDimS<-HSMM_subset@reducedDimS|,colnames(HSMM_subset@reducedDimS)%in%colnames(HSMML_subset)] 
HSMM_expressed_genes <- row.names(subset(fData(HSMM_subset), num_cells_expressed >= 10)) 

HSMM_expressed_genes <- row.names(subset(fData(HSMM), num_cells_expressed >= 10)) 

HSMM_filtered <- HSMM[HSMM_expressed_genes,] 

HSMM_filtered <- HSMM_subset[HSMM_expressed_genes,] 

diff_test_res <- differentialGeneTest(HSMM_filtered, fullModelFormulaStr="~sm.ns(Pseudotime)") 

diff=diff_test_res[,c("gene_short_name", "pval", "qval","use_for_ordering")] 


diff=difffdiff$qval < 0.01, ] 


diff=diffforder(diff$qval),] 


genelist <- row.names(diff_test_res[order(diff_test_res$qval)[1:800],]) 
genelist<-as.matrix(genelist) 


plot_pseudotime_heatmap(HSMM{genelist,], num_clusters = 2, cores =1, show_rownames = T) 


pseudotime_list<-plot_pseudotime_heatmap(HSMM{[genelist,],num_clusters = 2, cores = 1,show_rownames = T, return_heatmap = T) 
pseudotime_list <- as.data.frame(cutree(pseudotime_list $tree_row, k=2)) 

colnames(pseudotime_list) <- "Cluster" 

pseudotime_list $Gene <- rownames(pseudotime_list) 


head(pseudotime_list) 


write.csv(pseudotime_list,file ="coordinate.csv', sep ="," ) 
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